full_data <- readRDS('data/full_data.rds')
Monthly Production Data
prod_by_month <- full_data %>%
select(yearmonth, active_1km, monthly_oil_1km, monthly_gas_1km,
monthly_water_1km, monthly_boe_1km, active_2p5km, monthly_oil_2p5km,
monthly_gas_2p5km, monthly_water_2p5km, monthly_boe_2p5km, active_5km,
monthly_oil_5km, monthly_gas_5km, monthly_water_5km, monthly_boe_5km) %>%
distinct() %>%
filter(yearmonth <= '2023-03')
coef <- median(1032*prod_by_month$monthly_oil_1km/10^9)/median(prod_by_month$monthly_gas_1km/10^6)
full_data %>%
ggplot(aes(x = yearmonth, group = 1)) +
geom_line(aes(y = monthly_oil_1km/10^6, color = 'Oil'), size=1) +
geom_line(aes(y = 1032*monthly_gas_1km/10^9/coef, color = 'Natural Gas'), size=1) +
scale_y_continuous(
name = "Monthly Oil in Million Barrels",
sec.axis = sec_axis(~.*coef, name="Monthly Gas in Billion Cubic Feet")
) +
theme(
axis.title.y = element_text(color = 'tomato'),
axis.title.y.right = element_text(color = 'skyblue')
) +
scale_color_manual(values = c("skyblue", "tomato")) +
scale_x_discrete(breaks = prod_by_month$yearmonth[seq(1, length(prod_by_month$yearmonth), by = 7)]) +
xlab("Date") + labs(colour="Production Type") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 63301 rows containing missing values (`geom_line()`).
## Removed 63301 rows containing missing values (`geom_line()`).

Visualizing Daily Max H2S
H2S_dm <- full_data %>%
group_by(day, Monitor) %>%
summarise(H2S_daily_max = max(H2S, na.rm = TRUE)) %>%
mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max))
## Warning: There were 123 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1724: `day = 2020-08-22`, `Monitor = "Judson"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 122 remaining warnings.
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_ma <- H2S_dm %>%
mutate(yearmonth = format(day, "%Y-%m")) %>%
group_by(yearmonth, Monitor) %>%
summarise(H2S_monthly_average = mean(H2S_daily_max, na.rm = TRUE))
## `summarise()` has grouped output by 'yearmonth'. You can override using the
## `.groups` argument.
H2S_dm_graph <- H2S_dm %>%
ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Daily Max H2S concentration by monitor", y = 'Daily Max H2S', x = 'time') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_dm_graph) %>% layout(dragmode = 'pan')
H2S_dm_graph_b <- H2S_dm %>%
ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Daily Max H2S concentration by monitor w/o outlier", y = 'Daily Max H2S', x = 'time') +
scale_y_continuous(limits = c(0, 100)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
H2S_ma_graph <- H2S_ma %>%
ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Monthly Average H2S concentration by monitor", y = 'Monthly Average H2S', x = 'time') +
scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_ma_graph) %>% layout(dragmode = 'pan')
H2S_ma_graph_b <- H2S_ma %>%
ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Monthly Average H2S concentration by monitor w/o outlier", y = 'Monthly Average H2S', x = 'time') +
scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
scale_y_continuous(limits=c(0, 50)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_ma_graph_b) %>% layout(dragmode = 'pan')
The line with the largest deviation/peak corresponds to the
213th&Chico monitor, in 2021-10 Maybe we should start from January
2022 to start modelling to remove the extreme values, or simply removing
the 213 monitor since that’s set up specifically for the event.
Visualizing Daily Average H2S
# Compute daily average
H2S_da <- full_data %>%
group_by(day, Monitor) %>%
summarise(H2S_daily_avg = mean(H2S, na.rm=TRUE))
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_da_graph <- H2S_da %>%
ggplot(aes(x=day, y=H2S_daily_avg, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Daily Average H2S concentration by monitor", y = 'Daily Average H2S', x = 'time') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_da_graph) %>% layout(dragmode = 'pan')
H2S_da_graph_b <- H2S_da %>%
ggplot(aes(x=day, y=H2S_daily_avg, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Daily Average H2S concentration by monitor w/o outlier", y = 'Daily Avearge H2S', x = 'time') +
scale_y_continuous(limits = c(0, 100)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
full_data %>%
polarPlot(pollutant = "H2S", col = "turbo",
key.position = "bottom",
key.header = "mean H2S",
key.footer = NULL, title = 'hi')

# Create a polar map
# TBD: include markers for refineries
polarplot <- polarMap(
full_data %>%
filter(!(is.na(latitude.x) | is.na(H2S) | is.na(wd) | is.na(ws))) %>%
rename(date = DateTime),
pollutant = "H2S",
latitude = "latitude.x",
longitude = "longitude.x",
popup = "Monitor",
provider = "Esri.WorldImagery",
d.icon = 150,
d.fig = 2.5,
alpha = 0.75
)
##
Creating Polar Markers â– â– â– â– â– â– 15% | ETA: 9s
Creating Polar Markers â– â– â– â– â– â– â– â– 23%
## | ETA: 9s
Creating Polar Markers â– â– â– â– â– â– â– â– â– â– 31% | ETA: 7s
Creating Polar Markers
## â– â– â– â– â– â– â– â– â– â– â– â– â– 38% | ETA: 7s
Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 46% | ETA:
## 5s
Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 54% | ETA: 5s
Creating Polar Markers
## â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 62% | ETA: 4s
Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–
## 69% | ETA: 3s
Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 77% | ETA:
## 2s
Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 85% | ETA: 2s
Creating Polar
## Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 92% | ETA: 1s
polarplot
#saveWidget(polarplot, file="polarplot.html")
Downwind Indicator wrp Refinery
# Create variable to indicate downwind wrt refinery
wind_diff <- abs(full_data$Converted_Angle - 180 - full_data$wd)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)
full_data$downwind_ref <- as.integer(wind_diff <= 15)
Find angle between monitor and nearest water treatment plant
# Function to calculate the angle between two lat/long points
calculate_angle <- function(lat1, lon1, lat2, lon2) {
# Calculate the bearing between the two points
angle <- bearing(cbind(lon1, lat1), cbind(lon2, lat2))
# Return the angle
return(angle)
}
wrp <- full_data %>%
select(name, wrp_utm_x, wrp_utm_y) %>%
distinct()
wrp_coord <- cbind(wrp, terra::project(as.matrix(cbind(wrp$wrp_utm_x, wrp$wrp_utm_y)),
"+proj=utm +zone=11 ellps=WGS84",
"+proj=longlat +datum=WGS84")) %>%
rename(longitude.wrp = '1',
latitude.wrp = '2')
full_data <- full_data %>%
left_join(wrp_coord %>% select(name, latitude.wrp, longitude.wrp), join_by(name), keep=F)
mon_wrp <- full_data %>%
select(Monitor, name, latitude.x, longitude.x, latitude.wrp, longitude.wrp) %>%
distinct()
mon_wrp$wrp_angle <- calculate_angle(mon_wrp$latitude.x, mon_wrp$longitude.x,
mon_wrp$latitude.wrp, mon_wrp$longitude.wrp)
mon_wrp$wrp_angle <- if_else(mon_wrp$wrp_angle < 0, mon_wrp$wrp_angle + 360, mon_wrp$wrp_angle)
full_data <- full_data %>%
left_join(mon_wrp %>% select(Monitor, name, wrp_angle), join_by(Monitor, name), keep=F)
# Finally, check if wind direction is downwind w.r.t wrp
wind_diff <- abs(full_data$wrp_angle - full_data$wd)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)
full_data$downwind_wrp <- as.integer(wind_diff <= 15)
# Compute daily average wd/ws
daily_full <- full_data %>%
select(H2S, ws, wd, latitude.x, longitude.x, mon_utm_x, mon_utm_y, Monitor, MinDist,
Converted_Angle, Refinery, year, month, day, weekday,
monthly_oil_1km, monthly_gas_1km, monthly_oil_2p5km,
monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km,
dist_wrp, capacity, active_1km, active_2p5km, active_5km,
wrp_angle) %>%
group_by(Monitor, day) %>%
mutate(H2S_daily_max = max(H2S, na.rm=TRUE),
H2S_daily_avg = mean(H2S, na.rm=TRUE),
ws_avg = mean(ws, na.rm=TRUE),
wd_avg = as.numeric(mean(circular(wd, units = 'degrees'), na.rm=TRUE))) %>%
mutate(wd_avg = if_else(wd_avg < 0, wd_avg+360, wd_avg)) %>%
ungroup() %>%
rename(monitor_lat = latitude.x, monitor_lon = longitude.x) %>%
mutate(H2S_daily_max = ifelse(H2S_daily_max == -Inf, NA, H2S_daily_max)) %>%
select(-c(H2S, wd, ws)) %>%
distinct()
## Warning: There were 2449 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1637: `Monitor = "First Methodist"`, `day = 2022-11-23`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 2448 remaining warnings.
# Get the downwind indicators for daily data
wind_diff <- abs(daily_full$Converted_Angle - 180 - daily_full$wd_avg)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)
daily_full$daily_downwind_ref <- as.integer(wind_diff <= 15)
wind_diff <- abs(daily_full$wrp_angle - daily_full$wd_avg)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)
daily_full$daily_downwind_wrp <- as.integer(wind_diff <= 15)
Get monitor locations
monitors_coord <- full_data %>%
select(Monitor, latitude.x, longitude.x) %>%
distinct()
monitors <- monitors_coord %>%
select(Monitor)
coordinates(monitors_coord) <- ~ longitude.x + latitude.x
Load in elevation data
elevation <- raster('shapefiles/N33W119.hgt')
monitors <- cbind(monitors, extract(elevation, monitors_coord)) %>%
as.data.frame() %>%
rename(elevation = 2)
Load in EVI data
evi <- raster('shapefiles/MOD13Q1.006__250m_16_days_EVI_doy2022177_aid0001.tif')
monitors <- cbind(monitors, extract(evi, monitors_coord) * 0.0001) %>%
as.data.frame() %>%
rename(EVI = 3)
Merge EVI and elevation to data
full_data <- full_data %>%
left_join(monitors, join_by(Monitor))
daily_full <- daily_full %>%
left_join(monitors, join_by(Monitor))
Merge Odor Complaint data
odor <- read_csv('data/SCAQMD_odorcomplaints_2018_2022.csv')
## New names:
## Rows: 19913 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (3): ...1, zip, num_odor_complaints date (1): date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
la_county <- read_sf('shapefiles/Zip_Codes_(LA_County)/Zip_Codes_(LA_County).shp')
# https://gis.stackexchange.com/questions/282750/identify-polygon-containing-point-with-r-sf-package
pnts_sf <- do.call("st_sfc",c(lapply(1:nrow(monitors_coord@coords),
function(i) {st_point(as.numeric(monitors_coord@coords[i, ]))}), list("crs" = 4326)))
pnts_trans <- st_transform(pnts_sf, 2163) # apply transformation to pnts sf
la_county_trans <- st_transform(la_county, 2163) # apply transformation to polygons sf
# intersect and extract state name
monitors_coord@data$county <- apply(st_intersects(la_county_trans, pnts_trans, sparse = FALSE), 2,
function(col) {
la_county_trans[which(col), ]$ZIPCODE
})
# Plot results to double check
la_county_present <- la_county[la_county$ZIPCODE %in% unique(monitors_coord$county), ]
la_county_present %>%
ggplot() +
geom_sf(aes(fill = ZIPCODE)) +
geom_point(data = as_tibble(monitors_coord@coords), aes(x = longitude.x, y = latitude.x))

odor <- odor[odor$zip %in% unique(monitors_coord$county), ]
monitor_odor <- monitors_coord@data %>%
left_join(odor %>% mutate(zip = as.character(zip)), join_by(county == zip)) %>%
select(-`...1`)
## Warning in left_join(., odor %>% mutate(zip = as.character(zip)), join_by(county == : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 894 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
full_data <- full_data %>%
left_join(monitors_coord@data, join_by(Monitor))
daily_full <- daily_full %>%
left_join(monitors_coord@data, join_by(Monitor))
full_data <- full_data %>%
left_join(monitor_odor %>% select(Monitor, date, num_odor_complaints), join_by(Monitor, day == date))
full_data <- full_data %>%
mutate(num_odor_complaints = coalesce(num_odor_complaints, 0))
daily_full <- daily_full %>%
left_join(monitor_odor %>% select(Monitor, date, num_odor_complaints), join_by(Monitor, day == date))
daily_full <- daily_full %>%
mutate(num_odor_complaints = coalesce(num_odor_complaints, 0))
Explore some GAM models
Five minute H2S
h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws +
## I(1/MinDist^2)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.172e+00 3.213e-03 364.72 <2e-16 ***
## year2023 -3.263e-01 2.800e-03 -116.54 <2e-16 ***
## wd_secQ2 -2.560e-01 3.751e-03 -68.23 <2e-16 ***
## wd_secQ3 -2.898e-01 3.892e-03 -74.46 <2e-16 ***
## wd_secQ4 -2.095e-01 3.492e-03 -59.99 <2e-16 ***
## ws -5.424e-02 4.176e-04 -129.91 <2e-16 ***
## I(1/MinDist^2) 2.114e+05 2.329e+03 90.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.996 8 2496 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0946 Deviance explained = 9.47%
## GCV = 0.92965 Scale est. = 0.92963 n = 772251
plot(h2s_model_a)

h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+wd_sec+ws+I(1/MinDist^2)+
s(latitude.x, longitude.x, bs='tp', k = 8),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + I(1/MinDist^2) +
## s(latitude.x, longitude.x, bs = "tp", k = 8)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.012e+00 3.171e-03 319.16 <2e-16 ***
## wd_secQ2 -1.954e-01 3.696e-03 -52.87 <2e-16 ***
## wd_secQ3 -1.822e-01 3.905e-03 -46.65 <2e-16 ***
## wd_secQ4 -1.129e-01 3.487e-03 -32.39 <2e-16 ***
## ws -6.921e-02 4.147e-04 -166.90 <2e-16 ***
## I(1/MinDist^2) 3.002e+05 2.858e+03 105.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.999 8 1708 <2e-16 ***
## s(latitude.x,longitude.x) 6.999 7 6523 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.13 Deviance explained = 13%
## GCV = 0.8932 Scale est. = 0.89317 n = 772251
plot(h2s_model_b)


# Include converted angle and weekday
h2s_model_c <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + wd_sec + ws +
I(1/MinDist^2) + Converted_Angle +
s(latitude.x, longitude.x, bs='tp', k = 10) +
as.factor(weekday),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_c)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws +
## I(1/MinDist^2) + Converted_Angle + s(latitude.x, longitude.x,
## bs = "tp", k = 10) + as.factor(weekday)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.420e+00 2.374e-02 101.947 < 2e-16 ***
## year2023 -3.327e-01 2.864e-03 -116.135 < 2e-16 ***
## wd_secQ2 -1.858e-01 3.643e-03 -51.011 < 2e-16 ***
## wd_secQ3 -1.897e-01 3.846e-03 -49.325 < 2e-16 ***
## wd_secQ4 -1.090e-01 3.441e-03 -31.663 < 2e-16 ***
## ws -6.128e-02 4.076e-04 -150.319 < 2e-16 ***
## I(1/MinDist^2) 1.112e-04 1.961e-06 56.689 < 2e-16 ***
## Converted_Angle -5.775e-03 1.129e-04 -51.138 < 2e-16 ***
## as.factor(weekday).L 9.867e-02 2.801e-03 35.228 < 2e-16 ***
## as.factor(weekday).Q -1.699e-01 2.808e-03 -60.505 < 2e-16 ***
## as.factor(weekday).C -3.883e-03 2.803e-03 -1.385 0.166
## as.factor(weekday)^4 -2.054e-02 2.814e-03 -7.299 2.9e-13 ***
## as.factor(weekday)^5 -5.798e-02 2.807e-03 -20.654 < 2e-16 ***
## as.factor(weekday)^6 -2.507e-02 2.820e-03 -8.891 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.997 8 2708 <2e-16 ***
## s(latitude.x,longitude.x) 8.999 9 6588 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 30/31
## R-sq.(adj) = 0.156 Deviance explained = 15.6%
## GCV = 0.86621 Scale est. = 0.86618 n = 772251
plot(h2s_model_c)


# Include converted angle and weekday
h2s_model_d <- gam(H2S ~
s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_sec + ws + downwind_ref + downwind_wrp +
I(1/MinDist^2) + dist_wrp + capacity +
Converted_Angle +
s(latitude.x, longitude.x, bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km
,
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) +
## dist_wrp + capacity + Converted_Angle + s(latitude.x, longitude.x,
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.757e+00 5.816e-02 -47.406 < 2e-16 ***
## year2023 -1.356e-01 6.865e-03 -19.751 < 2e-16 ***
## as.factor(weekday).L 1.082e-01 2.989e-03 36.201 < 2e-16 ***
## as.factor(weekday).Q -1.838e-01 2.995e-03 -61.371 < 2e-16 ***
## as.factor(weekday).C -8.157e-04 2.981e-03 -0.274 0.78432
## as.factor(weekday)^4 -1.475e-02 2.983e-03 -4.945 7.61e-07 ***
## as.factor(weekday)^5 -5.943e-02 2.975e-03 -19.979 < 2e-16 ***
## as.factor(weekday)^6 -2.875e-02 2.985e-03 -9.631 < 2e-16 ***
## wd_secQ2 -1.520e-01 3.935e-03 -38.615 < 2e-16 ***
## wd_secQ3 -1.310e-01 4.161e-03 -31.492 < 2e-16 ***
## wd_secQ4 -7.165e-02 3.677e-03 -19.487 < 2e-16 ***
## ws -7.086e-02 4.333e-04 -163.531 < 2e-16 ***
## downwind_ref 1.444e-01 4.136e-03 34.917 < 2e-16 ***
## downwind_wrp -1.370e-02 4.575e-03 -2.993 0.00276 **
## I(1/MinDist^2) 4.732e-05 9.908e-07 47.760 < 2e-16 ***
## dist_wrp 5.576e-04 6.067e-06 91.915 < 2e-16 ***
## capacity 4.449e-03 4.555e-05 97.666 < 2e-16 ***
## Converted_Angle -2.954e-03 1.232e-04 -23.966 < 2e-16 ***
## monthly_oil_1km 5.172e-05 2.488e-06 20.787 < 2e-16 ***
## monthly_gas_1km 2.546e-04 1.226e-05 20.767 < 2e-16 ***
## active_1km -2.863e-02 1.076e-03 -26.604 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.998 8.000 988.2 <2e-16 ***
## s(latitude.x,longitude.x) 8.003 8.004 5198.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 36/38
## R-sq.(adj) = 0.162 Deviance explained = 16.2%
## GCV = 0.90017 Scale est. = 0.90012 n = 711593
plot(h2s_model_d)


# Include elvation, EVI, 3D smooth, odor
h2s_model_e <- gam(H2S ~
s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_sec + ws + downwind_ref + downwind_wrp +
I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
Converted_Angle + elevation + EVI +
s(latitude.x, longitude.x, bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
num_odor_complaints
,
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_e)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) +
## I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle +
## elevation + EVI + s(latitude.x, longitude.x, bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + num_odor_complaints
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.614e+00 2.308e-01 15.658 < 2e-16 ***
## year2023 2.185e-02 1.754e-02 1.246 0.213
## as.factor(weekday).L 1.076e-01 2.862e-03 37.580 < 2e-16 ***
## as.factor(weekday).Q -1.769e-01 2.869e-03 -61.675 < 2e-16 ***
## as.factor(weekday).C 4.129e-03 2.848e-03 1.450 0.147
## as.factor(weekday)^4 -1.431e-02 2.848e-03 -5.025 5.03e-07 ***
## as.factor(weekday)^5 -6.237e-02 2.840e-03 -21.958 < 2e-16 ***
## as.factor(weekday)^6 -2.703e-02 2.855e-03 -9.470 < 2e-16 ***
## wd_secQ2 -1.355e-01 3.770e-03 -35.952 < 2e-16 ***
## wd_secQ3 -1.676e-01 4.006e-03 -41.831 < 2e-16 ***
## wd_secQ4 -9.811e-02 3.529e-03 -27.803 < 2e-16 ***
## ws -6.682e-02 4.167e-04 -160.365 < 2e-16 ***
## downwind_ref 1.107e-01 3.971e-03 27.886 < 2e-16 ***
## downwind_wrp -4.663e-03 4.373e-03 -1.066 0.286
## I(1/MinDist^2) -5.018e-07 7.849e-06 -0.064 0.949
## I(1/dist_wrp^2) 1.899e-06 3.608e-07 5.264 1.41e-07 ***
## capacity 1.108e-02 1.242e-03 8.921 < 2e-16 ***
## wrp_angle 3.887e-03 2.075e-04 18.732 < 2e-16 ***
## Converted_Angle -2.337e-02 2.559e-03 -9.133 < 2e-16 ***
## elevation -2.407e-01 1.314e-02 -18.328 < 2e-16 ***
## EVI 3.454e+00 2.639e-01 13.088 < 2e-16 ***
## monthly_oil_1km 5.898e-05 4.933e-06 11.956 < 2e-16 ***
## monthly_gas_1km -3.534e-05 2.793e-05 -1.265 0.206
## active_1km -1.808e-02 1.844e-03 -9.805 < 2e-16 ***
## num_odor_complaints 3.483e-02 1.936e-03 17.992 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.992 8.000 383.3
## s(latitude.x,longitude.x) 1.678 1.678 126.8
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 81.434 81.435 860.7
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(latitude.x,longitude.x) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 120/131
## R-sq.(adj) = 0.239 Deviance explained = 23.9%
## GCV = 0.81794 Scale est. = 0.81781 n = 711593
plot(h2s_model_e)



gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 5673794 303.1 8396474 448.5 8396474 448.5
## Vcells 518362147 3954.8 1529350983 11668.1 1382492477 10547.6
# Model only the month of the event, October 2021
h2s_model_f <- gam(H2S ~ wd_sec + ws + downwind_ref + downwind_wrp +
I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
Converted_Angle + elevation + EVI +
s(latitude.x, longitude.x, bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
num_odor_complaints
,
data = full_data %>% filter(year == '2021', month == '10'))
summary(h2s_model_f)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) +
## I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle +
## elevation + EVI + s(latitude.x, longitude.x, bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + num_odor_complaints
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.456e-07 3.981e-07 2.375 0.017549 *
## wd_secQ2 3.217e+00 9.490e-01 3.390 0.000699 ***
## wd_secQ3 6.650e+00 9.427e-01 7.054 1.75e-12 ***
## wd_secQ4 8.045e+00 8.902e-01 9.038 < 2e-16 ***
## ws -2.347e+00 1.158e-01 -20.262 < 2e-16 ***
## downwind_ref 1.771e+00 8.597e-01 2.060 0.039407 *
## downwind_wrp 1.307e+00 1.077e+00 1.214 0.224903
## I(1/MinDist^2) -3.506e-03 1.585e-03 -2.212 0.026960 *
## I(1/dist_wrp^2) -1.201e-03 8.099e-05 -14.835 < 2e-16 ***
## capacity 7.376e-01 1.505e-01 4.899 9.63e-07 ***
## wrp_angle 1.019e+00 1.659e-01 6.146 8.00e-10 ***
## Converted_Angle -2.272e+00 2.881e-01 -7.885 3.19e-15 ***
## elevation -2.755e+01 2.379e+00 -11.581 < 2e-16 ***
## EVI 9.445e+02 1.053e+02 8.973 < 2e-16 ***
## monthly_oil_1km 1.626e-02 7.109e-03 2.287 0.022180 *
## monthly_gas_1km 2.391e-03 1.057e-03 2.263 0.023612 *
## active_1km 4.054e-05 1.846e-05 2.196 0.028071 *
## num_odor_complaints 7.443e-01 6.676e-02 11.150 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(latitude.x,longitude.x) 1.638 1.638 2.823
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 84.676 84.948 218.300
## p-value
## s(latitude.x,longitude.x) 0.173
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 106/116
## R-sq.(adj) = 0.263 Deviance explained = 26.4%
## GCV = 5458.6 Scale est. = 5451.7 n = 77510
plot(h2s_model_f)


gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 5674648 303.1 8396474 448.5 8396474 448.5
## Vcells 521886125 3981.7 1529350983 11668.1 1382492477 10547.6
# Model data excluding the month of the event, October 2021
h2s_model_g <- gam(H2S ~ s(as.numeric(month),bs='cc') + year +
wd_sec + ws + downwind_ref + downwind_wrp +
I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
Converted_Angle + elevation + EVI +
s(latitude.x, longitude.x, bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
num_odor_complaints
,
data = full_data %>% filter(year != '2021', month != '10'))
summary(h2s_model_g)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws +
## downwind_ref + downwind_wrp + I(1/MinDist^2) + I(1/dist_wrp^2) +
## capacity + wrp_angle + Converted_Angle + elevation + EVI +
## s(latitude.x, longitude.x, bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km +
## active_1km + num_odor_complaints
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.338e-01 1.222e-01 6.005 1.92e-09 ***
## year2022 -2.206e+00 1.614e-01 -13.669 < 2e-16 ***
## year2023 -2.316e+00 1.626e-01 -14.240 < 2e-16 ***
## wd_secQ2 -1.284e-01 2.891e-03 -44.408 < 2e-16 ***
## wd_secQ3 -2.125e-01 2.908e-03 -73.061 < 2e-16 ***
## wd_secQ4 -1.376e-01 2.742e-03 -50.176 < 2e-16 ***
## ws -5.836e-02 3.157e-04 -184.875 < 2e-16 ***
## downwind_ref 4.704e-02 2.601e-03 18.085 < 2e-16 ***
## downwind_wrp 3.566e-02 3.303e-03 10.795 < 2e-16 ***
## I(1/MinDist^2) -1.111e-04 2.839e-06 -39.129 < 2e-16 ***
## I(1/dist_wrp^2) 2.113e-05 5.665e-07 37.307 < 2e-16 ***
## capacity 2.165e-02 2.506e-04 86.397 < 2e-16 ***
## wrp_angle -9.541e-03 2.403e-04 -39.710 < 2e-16 ***
## Converted_Angle -1.446e-02 3.410e-04 -42.401 < 2e-16 ***
## elevation 1.177e-01 3.917e-03 30.062 < 2e-16 ***
## EVI -5.429e+00 1.549e-01 -35.053 < 2e-16 ***
## monthly_oil_1km -5.368e-05 2.183e-06 -24.587 < 2e-16 ***
## monthly_gas_1km -1.204e-03 2.359e-05 -51.012 < 2e-16 ***
## active_1km -3.450e-03 1.098e-03 -3.141 0.00168 **
## num_odor_complaints 4.586e-02 1.469e-03 31.218 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.999 8.000 1188
## s(latitude.x,longitude.x) 1.606 1.606 4394
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 72.495 72.496 2308
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(latitude.x,longitude.x) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 106/126
## R-sq.(adj) = 0.273 Deviance explained = 27.3%
## GCV = 0.7076 Scale est. = 0.70754 n = 1028486
plot(h2s_model_g)



gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 5676308 303.2 8396474 448.5 8396474 448.5
## Vcells 570433698 4352.1 1835301179 14002.3 1782965375 13603.0
Daily max H2S
# With month, year, wind sector/speed, dist to refinery, refinery
h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_avg+ws_avg+
I(1/MinDist^2),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_avg +
## ws_avg + I(1/MinDist^2)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.062e+00 3.468e-01 8.829 <2e-16 ***
## year2023 -4.367e-01 2.197e-01 -1.988 0.0469 *
## wd_avg 5.555e-04 1.031e-03 0.539 0.5903
## ws_avg -8.938e-02 6.342e-02 -1.409 0.1588
## I(1/MinDist^2) 7.943e-07 8.452e-08 9.399 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.171 8 2.002 7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 12/13
## R-sq.(adj) = 0.00868 Deviance explained = 1.06%
## GCV = 21.458 Scale est. = 21.408 n = 2664
plot(h2s_dm_model_a)

# Also with location of monitor
h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
I(1/MinDist^2)+Refinery+s(monitor_lat, monitor_lon, bs='tp', k=10),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg +
## I(1/MinDist^2) + Refinery + s(monitor_lat, monitor_lon, bs = "tp",
## k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.085e+00 1.486e+01 0.612 0.54088
## wd_avg 2.922e-03 1.008e-03 2.900 0.00376 **
## ws_avg -3.126e-01 6.339e-02 -4.931 8.7e-07 ***
## I(1/MinDist^2) -1.612e-05 1.834e-03 -0.009 0.99299
## RefineryMarathon (Carson) -2.861e-01 1.840e+01 -0.016 0.98759
## RefineryMarathon (Wilmington) -3.058e+00 1.820e+01 -0.168 0.86659
## RefineryPhillips 66 (Wilmington) -1.378e+01 1.702e+01 -0.809 0.41840
## RefineryTorrance Refinery -2.545e+00 1.470e+01 -0.173 0.86261
## RefineryValero Refinery -6.947e+00 1.842e+01 -0.377 0.70604
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.026 8.000 1.698 0.000225 ***
## s(monitor_lat,monitor_lon) 5.408 5.699 9.292 6.79e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 25/26
## R-sq.(adj) = 0.127 Deviance explained = 13.1%
## GCV = 18.972 Scale est. = 18.862 n = 2664
plot(h2s_dm_model_b)


# Also include angle to refinery
h2s_dm_model_c <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
I(1/MinDist^2)+Converted_Angle+
s(monitor_lat, monitor_lon, bs='tp', k=10),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_c)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg +
## I(1/MinDist^2) + Converted_Angle + s(monitor_lat, monitor_lon,
## bs = "tp", k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.630e+00 8.823e-01 4.114 4e-05 ***
## wd_avg 2.807e-03 1.012e-03 2.775 0.00555 **
## ws_avg -2.243e-01 6.210e-02 -3.612 0.00031 ***
## I(1/MinDist^2) -8.554e-06 3.810e-05 -0.225 0.82237
## Converted_Angle -3.393e-03 3.891e-03 -0.872 0.38337
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.103 8.000 2.015 5.44e-05 ***
## s(monitor_lat,monitor_lon) 7.070 7.918 41.050 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 21/22
## R-sq.(adj) = 0.116 Deviance explained = 12.1%
## GCV = 19.175 Scale est. = 19.08 n = 2664
plot(h2s_dm_model_c)


h2s_dm_model_d <- gam(H2S_daily_max ~
s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/MinDist^2) + dist_wrp + capacity +
Converted_Angle +
s(monitor_lat, monitor_lon, bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km
,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + dist_wrp +
## capacity + Converted_Angle + s(monitor_lat, monitor_lon,
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.172e+01 4.645e+00 -2.524 0.011678 *
## year2023 -4.292e-01 4.437e-01 -0.967 0.333507
## as.factor(weekday).L 4.648e-01 2.421e-01 1.920 0.054935 .
## as.factor(weekday).Q -9.271e-01 2.424e-01 -3.825 0.000134 ***
## as.factor(weekday).C -1.117e-02 2.406e-01 -0.046 0.962981
## as.factor(weekday)^4 -1.140e-01 2.401e-01 -0.475 0.635008
## as.factor(weekday)^5 -3.636e-01 2.393e-01 -1.519 0.128806
## as.factor(weekday)^6 -1.897e-01 2.397e-01 -0.791 0.428742
## wd_avg 3.774e-03 1.102e-03 3.426 0.000624 ***
## ws_avg -3.508e-01 6.988e-02 -5.020 5.54e-07 ***
## daily_downwind_ref 8.150e-01 4.117e-01 1.980 0.047827 *
## I(1/MinDist^2) 1.228e-04 4.155e-04 0.296 0.767571
## dist_wrp 2.170e-03 7.111e-04 3.051 0.002304 **
## capacity 9.117e-03 1.247e-02 0.731 0.464918
## Converted_Angle 6.839e-03 9.438e-03 0.725 0.468756
## monthly_oil_1km -3.487e-05 1.292e-04 -0.270 0.787354
## monthly_gas_1km -2.724e-04 5.929e-04 -0.459 0.645948
## active_1km 5.170e-03 4.278e-02 0.121 0.903816
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 1.729 8.000 0.976 0.00459 **
## s(monitor_lat,monitor_lon) 7.783 7.975 13.054 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 34/35
## R-sq.(adj) = 0.137 Deviance explained = 14.6%
## GCV = 20.188 Scale est. = 19.967 n = 2428
plot(h2s_dm_model_d)


# Try to include as many variables as possible
h2s_dm_model_e <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_e)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.666e+00 1.815e+00 3.672 0.000246 ***
## year2023 -4.106e-01 4.443e-01 -0.924 0.355507
## as.character(weekday)Mon -6.843e-01 3.373e-01 -2.029 0.042598 *
## as.character(weekday)Sat -7.572e-01 3.433e-01 -2.206 0.027510 *
## as.character(weekday)Sun -1.192e+00 3.413e-01 -3.494 0.000485 ***
## as.character(weekday)Thu -3.588e-01 3.417e-01 -1.050 0.293756
## as.character(weekday)Tue -1.363e-01 3.363e-01 -0.405 0.685185
## as.character(weekday)Wed 2.723e-02 3.407e-01 0.080 0.936305
## wd_avg 3.625e-03 1.105e-03 3.279 0.001057 **
## ws_avg -3.414e-01 6.961e-02 -4.904 1.00e-06 ***
## daily_downwind_ref 7.746e-01 4.071e-01 1.903 0.057179 .
## I(1/dist_wrp^2) -1.882e-06 2.943e-07 -6.394 1.93e-10 ***
## monthly_oil_1km -4.350e-05 1.272e-04 -0.342 0.732462
## monthly_gas_1km -2.489e-04 6.016e-04 -0.414 0.679122
## active_1km 1.291e-02 4.168e-02 0.310 0.756751
## daily_downwind_wrp 1.454e-01 4.311e-01 0.337 0.735989
## elevation -1.143e-01 6.418e-02 -1.781 0.075053 .
## EVI -3.427e+00 7.699e-01 -4.452 8.90e-06 ***
## num_odor_complaints 1.615e-01 1.555e-01 1.038 0.299280
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.085 8.000 0.413
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.159 8.756 14.556
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 3.089 77.000 0.090
## p-value
## s(as.numeric(month)) 0.0156 *
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 0.0122 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 112/113
## R-sq.(adj) = 0.139 Deviance explained = 14.9%
## GCV = 20.176 Scale est. = 19.924 n = 2428
plot(h2s_dm_model_e)



e <- getViz(h2s_dm_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(e, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
#
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_dm_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
h2s_dm_model_f <- gam(H2S_daily_max ~ as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints,
data = daily_full %>% filter(year == '2021', month == '10'))
summary(h2s_dm_model_f)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ as.character(weekday) + wd_avg + ws_avg + daily_downwind_ref +
## I(1/dist_wrp^2) + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km +
## daily_downwind_wrp + elevation + EVI + num_odor_complaints
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.155e-06 1.228e-06 5.013 1.01e-06 ***
## as.character(weekday)Mon 1.349e+00 9.098e+01 0.015 0.988183
## as.character(weekday)Sat -1.910e+01 7.183e+01 -0.266 0.790587
## as.character(weekday)Sun -2.350e+01 7.184e+01 -0.327 0.743844
## as.character(weekday)Thu 1.712e+01 7.449e+01 0.230 0.818387
## as.character(weekday)Tue -5.373e+01 7.929e+01 -0.678 0.498642
## as.character(weekday)Wed 5.286e+00 7.495e+01 0.071 0.943831
## wd_avg -1.782e-01 3.223e-01 -0.553 0.580948
## ws_avg -5.224e+00 3.576e+01 -0.146 0.883962
## daily_downwind_ref -1.557e+02 9.324e+01 -1.670 0.096196 .
## I(1/dist_wrp^2) -4.680e-04 1.258e-04 -3.720 0.000246 ***
## monthly_oil_1km 1.095e-01 2.184e-02 5.013 1.01e-06 ***
## monthly_gas_1km 1.626e-02 3.242e-03 5.013 1.01e-06 ***
## active_1km 2.831e-04 5.648e-05 5.013 1.01e-06 ***
## daily_downwind_wrp -3.084e+01 1.189e+02 -0.259 0.795605
## elevation -9.357e+01 1.750e+01 -5.345 2.03e-07 ***
## EVI 8.195e+01 2.217e+02 0.370 0.711942
## num_odor_complaints 9.092e-01 1.619e+00 0.562 0.574839
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.793e+00 8.984 15.11
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 1.611e-07 80.000 0.00
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 0.901
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 103/107
## R-sq.(adj) = 0.453 Deviance explained = 49.6%
## GCV = 1.2069e+05 Scale est. = 1.1065e+05 n = 274
plot(h2s_dm_model_f)


f <- getViz(h2s_dm_model_f)
plot(sm(f, 1)) + l_fitRaster() + l_fitContour() + l_points()

h2s_dm_model_g <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints,
data = daily_full %>% filter(year != '2021', month != '10'))
summary(h2s_dm_model_g)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.479e+00 2.085e+00 3.587 0.00034 ***
## year2022 -2.703e-01 8.084e-01 -0.334 0.73814
## year2023 -6.711e-01 9.060e-01 -0.741 0.45896
## as.character(weekday)Mon -7.542e-01 2.311e-01 -3.264 0.00111 **
## as.character(weekday)Sat -7.661e-01 2.345e-01 -3.267 0.00110 **
## as.character(weekday)Sun -1.090e+00 2.335e-01 -4.669 3.14e-06 ***
## as.character(weekday)Thu -5.043e-01 2.332e-01 -2.163 0.03063 *
## as.character(weekday)Tue -4.176e-01 2.309e-01 -1.808 0.07062 .
## as.character(weekday)Wed -2.204e-01 2.331e-01 -0.945 0.34452
## wd_avg 2.271e-03 8.069e-04 2.815 0.00491 **
## ws_avg -3.303e-01 4.852e-02 -6.807 1.17e-11 ***
## daily_downwind_ref 3.628e-01 2.234e-01 1.624 0.10438
## I(1/dist_wrp^2) -3.288e-06 4.143e-07 -7.936 2.79e-15 ***
## monthly_oil_1km -8.192e-05 9.291e-05 -0.882 0.37799
## monthly_gas_1km 3.606e-04 4.843e-04 0.745 0.45659
## active_1km 1.483e-02 3.016e-02 0.492 0.62288
## daily_downwind_wrp 2.541e-01 2.843e-01 0.894 0.37149
## elevation -1.432e-01 5.810e-02 -2.464 0.01378 *
## EVI -3.290e+00 6.402e-01 -5.139 2.92e-07 ***
## num_odor_complaints 2.533e-01 1.041e-01 2.432 0.01505 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.218 8.000 0.307
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.848 8.955 18.646
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 12.632 66.000 0.564
## p-value
## s(as.numeric(month)) 0.0385 *
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 1.77e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 102/103
## R-sq.(adj) = 0.159 Deviance explained = 16.9%
## GCV = 13.758 Scale est. = 13.596 n = 3540
plot(h2s_dm_model_g)



g <- getViz(h2s_dm_model_g)
plot(sm(g, 2)) + l_fitRaster() + l_fitContour() + l_points()

Daily Average
# Look at daily average
h2s_da_model_a <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
Converted_Angle +
s(monitor_lon, monitor_lat, bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) +
## capacity + Converted_Angle + s(monitor_lon, monitor_lat,
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.312e+00 6.250e-01 -8.499 < 2e-16 ***
## year2023 -1.285e-01 6.043e-02 -2.127 0.03355 *
## as.factor(weekday).L 8.010e-02 2.546e-02 3.146 0.00168 **
## as.factor(weekday).Q -1.702e-01 2.547e-02 -6.682 2.93e-11 ***
## as.factor(weekday).C 2.449e-02 2.527e-02 0.969 0.33268
## as.factor(weekday)^4 3.029e-04 2.521e-02 0.012 0.99042
## as.factor(weekday)^5 -5.445e-02 2.514e-02 -2.166 0.03040 *
## as.factor(weekday)^6 -5.067e-02 2.517e-02 -2.013 0.04421 *
## wd_avg 8.357e-04 1.162e-04 7.192 8.49e-13 ***
## ws_avg -1.196e-01 7.590e-03 -15.758 < 2e-16 ***
## daily_downwind_ref 2.886e-01 4.339e-02 6.651 3.59e-11 ***
## I(1/MinDist^2) 3.248e-04 3.115e-05 10.426 < 2e-16 ***
## I(1/dist_wrp^2) -1.769e-05 2.306e-06 -7.670 2.49e-14 ***
## capacity 1.673e-02 1.210e-03 13.835 < 2e-16 ***
## Converted_Angle -2.682e-03 1.048e-03 -2.558 0.01058 *
## monthly_oil_1km 4.360e-05 1.992e-05 2.189 0.02872 *
## monthly_gas_1km 1.888e-04 1.009e-04 1.870 0.06156 .
## active_1km -2.076e-02 8.581e-03 -2.419 0.01563 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.485 8 9.614 <2e-16 ***
## s(monitor_lon,monitor_lat) 8.987 9 73.350 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 33/35
## R-sq.(adj) = 0.396 Deviance explained = 40.3%
## GCV = 0.22287 Scale est. = 0.21988 n = 2428
plot(h2s_da_model_a)


a <- getViz(h2s_da_model_a)
plot(sm(a, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Look at daily average
h2s_da_model_b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
Converted_Angle +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) +
## capacity + Converted_Angle + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.724e+00 5.238e-01 -7.109 1.54e-12 ***
## year2023 -1.286e-01 6.043e-02 -2.128 0.03346 *
## as.factor(weekday).L 8.010e-02 2.546e-02 3.146 0.00168 **
## as.factor(weekday).Q -1.702e-01 2.547e-02 -6.681 2.94e-11 ***
## as.factor(weekday).C 2.449e-02 2.527e-02 0.969 0.33269
## as.factor(weekday)^4 2.901e-04 2.521e-02 0.012 0.99082
## as.factor(weekday)^5 -5.444e-02 2.514e-02 -2.166 0.03043 *
## as.factor(weekday)^6 -5.066e-02 2.517e-02 -2.013 0.04426 *
## wd_avg 8.356e-04 1.162e-04 7.192 8.52e-13 ***
## ws_avg -1.195e-01 7.589e-03 -15.752 < 2e-16 ***
## daily_downwind_ref 2.886e-01 4.339e-02 6.652 3.58e-11 ***
## I(1/MinDist^2) 1.481e-04 1.546e-05 9.582 < 2e-16 ***
## I(1/dist_wrp^2) -1.109e-05 1.108e-06 -10.016 < 2e-16 ***
## capacity 1.276e-02 9.036e-04 14.122 < 2e-16 ***
## Converted_Angle -2.550e-03 1.030e-03 -2.476 0.01335 *
## monthly_oil_1km 4.361e-05 1.992e-05 2.189 0.02870 *
## monthly_gas_1km 1.888e-04 1.009e-04 1.871 0.06145 .
## active_1km -2.077e-02 8.580e-03 -2.420 0.01559 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.484 8 9.618 <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.980 9 73.203 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 33/35
## R-sq.(adj) = 0.396 Deviance explained = 40.3%
## GCV = 0.22286 Scale est. = 0.21988 n = 2428
plot(h2s_da_model_b)


b <- getViz(h2s_da_model_b)
plot(sm(b, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_c <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_c)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## monthly_oil_1km + monthly_gas_1km + active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.103e-01 2.439e-01 3.733 0.000194 ***
## year2023 -1.563e-01 6.337e-02 -2.467 0.013688 *
## as.factor(weekday).L 8.209e-02 2.670e-02 3.075 0.002128 **
## as.factor(weekday).Q -1.670e-01 2.670e-02 -6.253 4.74e-10 ***
## as.factor(weekday).C 2.391e-02 2.650e-02 0.902 0.366948
## as.factor(weekday)^4 -5.662e-03 2.643e-02 -0.214 0.830409
## as.factor(weekday)^5 -4.929e-02 2.635e-02 -1.870 0.061551 .
## as.factor(weekday)^6 -4.562e-02 2.639e-02 -1.729 0.083972 .
## wd_avg 7.589e-04 1.217e-04 6.237 5.26e-10 ***
## ws_avg -9.445e-02 7.733e-03 -12.215 < 2e-16 ***
## daily_downwind_ref 2.650e-01 4.535e-02 5.843 5.84e-09 ***
## I(1/dist_wrp^2) 2.316e-07 5.103e-08 4.539 5.94e-06 ***
## monthly_oil_1km 4.501e-05 2.092e-05 2.152 0.031505 *
## monthly_gas_1km 2.173e-04 1.059e-04 2.053 0.040221 *
## active_1km -2.346e-02 9.008e-03 -2.604 0.009273 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.501 8.000 10.11 <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.905 8.997 76.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 31/32
## R-sq.(adj) = 0.336 Deviance explained = 34.4%
## GCV = 0.24477 Scale est. = 0.2417 n = 2428
plot(h2s_da_model_c)


c <- getViz(h2s_da_model_c)
plot(sm(c, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_d <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## monthly_oil_1km + monthly_gas_1km + active_1km + daily_downwind_wrp +
## elevation + EVI
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.527e+00 2.618e-01 5.834 6.14e-09 ***
## year2023 -1.295e-01 6.042e-02 -2.143 0.03219 *
## as.character(weekday)Mon -8.859e-02 3.551e-02 -2.495 0.01267 *
## as.character(weekday)Sat -9.877e-02 3.608e-02 -2.737 0.00624 **
## as.character(weekday)Sun -1.985e-01 3.588e-02 -5.532 3.51e-08 ***
## as.character(weekday)Thu -4.873e-02 3.591e-02 -1.357 0.17490
## as.character(weekday)Tue 1.210e-03 3.539e-02 0.034 0.97273
## as.character(weekday)Wed 5.394e-02 3.584e-02 1.505 0.13248
## wd_avg 8.351e-04 1.162e-04 7.188 8.74e-13 ***
## ws_avg -1.200e-01 7.593e-03 -15.804 < 2e-16 ***
## daily_downwind_ref 2.897e-01 4.334e-02 6.684 2.89e-11 ***
## I(1/dist_wrp^2) -2.132e-07 3.781e-08 -5.640 1.90e-08 ***
## monthly_oil_1km 4.348e-05 1.991e-05 2.184 0.02907 *
## monthly_gas_1km 1.905e-04 1.009e-04 1.888 0.05920 .
## active_1km -2.088e-02 8.576e-03 -2.434 0.01499 *
## daily_downwind_wrp 5.124e-02 4.532e-02 1.131 0.25835
## elevation -1.933e-02 7.594e-03 -2.545 0.01099 *
## EVI -1.223e+00 8.124e-02 -15.055 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.479 8.000 9.638 <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.912 8.997 47.632 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 34/35
## R-sq.(adj) = 0.396 Deviance explained = 40.4%
## GCV = 0.22293 Scale est. = 0.21987 n = 2428
plot(h2s_da_model_d)


d <- getViz(h2s_da_model_d)
plot(sm(d, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_e <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_e)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.024e+00 5.182e-01 3.906 9.65e-05 ***
## year2023 1.875e-01 1.458e-01 1.286 0.19860
## as.character(weekday)Mon -8.906e-02 2.900e-02 -3.071 0.00216 **
## as.character(weekday)Sat -9.560e-02 2.943e-02 -3.248 0.00118 **
## as.character(weekday)Sun -1.987e-01 2.930e-02 -6.781 1.51e-11 ***
## as.character(weekday)Thu -4.744e-02 2.934e-02 -1.617 0.10600
## as.character(weekday)Tue 1.461e-03 2.895e-02 0.050 0.95974
## as.character(weekday)Wed 4.602e-02 2.926e-02 1.573 0.11589
## wd_avg 6.597e-04 9.982e-05 6.609 4.79e-11 ***
## ws_avg -1.106e-01 6.455e-03 -17.141 < 2e-16 ***
## daily_downwind_ref 2.198e-01 3.605e-02 6.097 1.26e-09 ***
## I(1/dist_wrp^2) 4.334e-08 1.194e-07 0.363 0.71674
## monthly_oil_1km 4.255e-05 3.307e-05 1.287 0.19831
## monthly_gas_1km 6.281e-05 1.981e-04 0.317 0.75119
## active_1km -9.381e-03 1.272e-02 -0.737 0.46107
## daily_downwind_wrp 5.237e-02 3.768e-02 1.390 0.16472
## elevation -6.505e-02 1.038e-02 -6.267 4.38e-10 ***
## EVI -1.618e+00 1.281e-01 -12.632 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.524 8 4.006
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 21.163
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.659 77 16.637
## p-value
## s(as.numeric(month)) 4.59e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 111/112
## R-sq.(adj) = 0.599 Deviance explained = 61.7%
## GCV = 0.15275 Scale est. = 0.14588 n = 2428
plot(h2s_da_model_e)



e <- getViz(h2s_da_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
#
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
h2s_da_model_f <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_f)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.039e+00 5.177e-01 3.940 8.40e-05 ***
## year2023 1.934e-01 1.457e-01 1.328 0.18446
## as.character(weekday)Mon -8.896e-02 2.898e-02 -3.070 0.00217 **
## as.character(weekday)Sat -9.447e-02 2.941e-02 -3.212 0.00134 **
## as.character(weekday)Sun -1.958e-01 2.930e-02 -6.680 2.98e-11 ***
## as.character(weekday)Thu -4.941e-02 2.933e-02 -1.685 0.09216 .
## as.character(weekday)Tue 1.885e-03 2.892e-02 0.065 0.94803
## as.character(weekday)Wed 4.586e-02 2.923e-02 1.569 0.11684
## wd_avg 6.445e-04 9.999e-05 6.446 1.40e-10 ***
## ws_avg -1.101e-01 6.455e-03 -17.053 < 2e-16 ***
## daily_downwind_ref 2.206e-01 3.603e-02 6.124 1.07e-09 ***
## I(1/dist_wrp^2) 4.611e-08 1.193e-07 0.386 0.69926
## monthly_oil_1km 3.829e-05 3.308e-05 1.158 0.24706
## monthly_gas_1km 6.831e-05 1.979e-04 0.345 0.72996
## active_1km -8.711e-03 1.271e-02 -0.685 0.49333
## daily_downwind_wrp 5.350e-02 3.765e-02 1.421 0.15553
## elevation -6.481e-02 1.037e-02 -6.248 4.93e-10 ***
## EVI -1.610e+00 1.280e-01 -12.578 < 2e-16 ***
## num_odor_complaints 2.989e-02 1.375e-02 2.174 0.02980 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.517 8 4.042
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 21.194
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.661 77 16.635
## p-value
## s(as.numeric(month)) 4.01e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 112/113
## R-sq.(adj) = 0.6 Deviance explained = 61.8%
## GCV = 0.15257 Scale est. = 0.14565 n = 2428
plot(h2s_da_model_f)



f <- getViz(h2s_da_model_f)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
#
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
h2s_da_model_g <- gam(H2S_daily_avg~ as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints,
data = daily_full %>% filter(year == '2021', month == '10'))
summary(h2s_da_model_g)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ as.character(weekday) + wd_avg + ws_avg + daily_downwind_ref +
## I(1/dist_wrp^2) + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km +
## daily_downwind_wrp + elevation + EVI + num_odor_complaints
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.644e-07 1.379e-07 4.816 2.53e-06 ***
## as.character(weekday)Mon -7.118e+00 1.025e+01 -0.695 0.487976
## as.character(weekday)Sat -6.445e+00 8.092e+00 -0.796 0.426498
## as.character(weekday)Sun -8.848e+00 8.094e+00 -1.093 0.275368
## as.character(weekday)Thu 8.558e-02 8.391e+00 0.010 0.991871
## as.character(weekday)Tue -1.298e+01 8.933e+00 -1.453 0.147520
## as.character(weekday)Wed -3.811e+00 8.444e+00 -0.451 0.652146
## wd_avg -1.884e-02 3.631e-02 -0.519 0.604363
## ws_avg 3.440e-01 4.027e+00 0.085 0.931997
## daily_downwind_ref -1.605e+01 1.050e+01 -1.528 0.127716
## I(1/dist_wrp^2) -5.240e-05 1.408e-05 -3.720 0.000245 ***
## monthly_oil_1km 1.182e-02 2.454e-03 4.816 2.53e-06 ***
## monthly_gas_1km 1.755e-03 3.643e-04 4.816 2.53e-06 ***
## active_1km 3.056e-05 6.345e-06 4.816 2.53e-06 ***
## daily_downwind_wrp -1.092e+01 1.340e+01 -0.815 0.415823
## elevation -1.008e+01 1.966e+00 -5.129 5.83e-07 ***
## EVI 1.128e+01 2.497e+01 0.452 0.651847
## num_odor_complaints 2.407e-01 1.823e-01 1.321 0.187853
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.78e+00 8.982 12.66
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 1.18e-07 80.000 0.00
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 0.902
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 103/107
## R-sq.(adj) = 0.428 Deviance explained = 47.4%
## GCV = 1531.7 Scale est. = 1404.4 n = 274
plot(h2s_da_model_g)


g <- getViz(h2s_da_model_g)
plot(sm(g, 1)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_h <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints,
data = daily_full %>% filter(year != '2021', month != '10'))
summary(h2s_da_model_h)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.242e+00 1.108e+00 5.634 1.90e-08 ***
## year2022 -1.400e+00 1.329e+00 -1.053 0.292219
## year2023 -1.425e+00 1.342e+00 -1.062 0.288518
## as.character(weekday)Mon -1.141e-01 2.614e-02 -4.366 1.31e-05 ***
## as.character(weekday)Sat -1.281e-01 2.648e-02 -4.836 1.38e-06 ***
## as.character(weekday)Sun -1.978e-01 2.643e-02 -7.487 8.91e-14 ***
## as.character(weekday)Thu -5.216e-02 2.634e-02 -1.980 0.047761 *
## as.character(weekday)Tue -6.735e-02 2.612e-02 -2.579 0.009953 **
## as.character(weekday)Wed -1.623e-02 2.634e-02 -0.616 0.537705
## wd_avg 3.136e-04 9.337e-05 3.359 0.000792 ***
## ws_avg -1.054e-01 5.891e-03 -17.893 < 2e-16 ***
## daily_downwind_ref 3.858e-02 2.571e-02 1.501 0.133566
## I(1/dist_wrp^2) 3.957e-05 1.535e-05 2.578 0.009985 **
## monthly_oil_1km -4.282e-05 1.808e-05 -2.368 0.017934 *
## monthly_gas_1km -1.210e-03 1.926e-04 -6.281 3.78e-10 ***
## active_1km -2.150e-04 9.072e-03 -0.024 0.981094
## daily_downwind_wrp 6.052e-02 3.248e-02 1.863 0.062480 .
## elevation -2.879e-02 7.638e-03 -3.769 0.000166 ***
## EVI -1.581e+00 1.146e-01 -13.795 < 2e-16 ***
## num_odor_complaints 3.456e-02 1.227e-02 2.816 0.004887 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.911 8 15.33
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 13.40
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 64.809 66 25.36
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 102/103
## R-sq.(adj) = 0.58 Deviance explained = 59.2%
## GCV = 0.17792 Scale est. = 0.17286 n = 3540
plot(h2s_da_model_h)



h <- getViz(h2s_da_model_h)
plot(sm(h, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_i <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints + october2021,
data = daily_full %>% mutate(october2021 = if_else(year == '2021', month == '10', 1, 0)))
summary(h2s_da_model_i)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints + october2021
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.859e+00 5.194e+00 0.358 0.72050
## year2021 4.910e+00 1.540e+00 3.188 0.00144 **
## year2022 4.982e-01 1.754e+00 0.284 0.77644
## year2023 2.934e-01 2.027e+00 0.145 0.88490
## as.character(weekday)Mon -3.154e-01 4.028e-01 -0.783 0.43361
## as.character(weekday)Sat -2.694e-01 4.025e-01 -0.669 0.50325
## as.character(weekday)Sun -3.207e-01 4.030e-01 -0.796 0.42616
## as.character(weekday)Thu -3.159e-01 4.033e-01 -0.783 0.43347
## as.character(weekday)Tue -6.353e-01 4.016e-01 -1.582 0.11367
## as.character(weekday)Wed -3.640e-01 4.039e-01 -0.901 0.36752
## wd_avg 1.701e-04 1.428e-03 0.119 0.90516
## ws_avg -1.220e-01 9.043e-02 -1.349 0.17748
## daily_downwind_ref 1.895e-02 4.006e-01 0.047 0.96227
## I(1/dist_wrp^2) -2.131e-06 4.452e-06 -0.479 0.63221
## monthly_oil_1km 8.383e-05 2.178e-04 0.385 0.70038
## monthly_gas_1km 1.844e-04 8.925e-04 0.207 0.83636
## active_1km 5.923e-02 7.735e-02 0.766 0.44381
## daily_downwind_wrp 3.054e-01 4.854e-01 0.629 0.52922
## elevation -4.989e-01 9.598e-02 -5.198 2.07e-07 ***
## EVI 3.145e-01 1.130e+00 0.278 0.78083
## num_odor_complaints 9.312e-01 2.918e-02 31.917 < 2e-16 ***
## october2021 4.716e+00 6.616e-01 7.129 1.12e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.134e-06 8.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 6.686e+00 7.115 3.714
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 6.533e+01 80.000 2.512
## p-value
## s(as.numeric(month)) 0.674508
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 0.000462 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 118/119
## R-sq.(adj) = 0.253 Deviance explained = 26.3%
## GCV = 79.302 Scale est. = 78.213 n = 6771
plot(h2s_da_model_i)



i <- getViz(h2s_da_model_i)
plot(sm(i, 2)) + l_fitRaster() + l_fitContour() + l_points()

knitr::kable(tibble(Model = c('Since Feb 2022',
'October 2021 Only',
'Exclude October 2021',
'October 2021 Indicator'),
'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
round(summary(h2s_da_model_g)$r.sq, 2),
round(summary(h2s_da_model_h)$r.sq, 2),
round(summary(h2s_da_model_i)$r.sq, 2))))
| Since Feb 2022 |
0.60 |
| October 2021 Only |
0.43 |
| Exclude October 2021 |
0.58 |
| October 2021 Indicator |
0.25 |
80/20 Cross Validation on Daily Average models
result <- tibble(Model = character(),
'80/20 Train R-Sq' = numeric(),
'80/20 Test R-Sq' = numeric())
predictors <- c('H2S_daily_avg', 'month', 'year', 'weekday', 'wd_avg', 'ws_avg', 'daily_downwind_ref', 'dist_wrp',
'mon_utm_x', 'mon_utm_y', 'day', 'monthly_oil_1km', 'monthly_gas_1km',
'active_1km', 'daily_downwind_wrp', 'elevation', 'EVI', 'num_odor_complaints')
set.seed(313)
# model 1: since Feb 2022
train <- daily_full %>%
filter(day >= '2022-02-01') %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)) %>%
sample_frac(0.8)
test <- anti_join(daily_full %>%
filter(day >= '2022-02-01') %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints)`
h2s_da_model_f2 <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints,
data = train)
predictions <- predict(h2s_da_model_f2, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2)$n - 1)/
(summary(h2s_da_model_f2)$n - summary(h2s_da_model_f2)$np - 1)
result <- rbind(result, tibble(Model = 'Since Feb 2022',
'80/20 Train R-Sq' = summary(h2s_da_model_f2)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
# Model 2: October 2021 only
train <- daily_full %>%
filter(year == '2021', month == '10') %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)) %>%
sample_frac(0.8)
test <- anti_join(daily_full %>%
filter(year == '2021', month == '10') %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints)`
h2s_da_model_f3 <- gam(H2S_daily_avg~as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints,
data = train)
predictions <- predict(h2s_da_model_f3, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f3)$n - 1)/
(summary(h2s_da_model_f3)$n - summary(h2s_da_model_f3)$np - 1)
result <- rbind(result, tibble(Model = 'October 2021 Only',
'80/20 Train R-Sq' = summary(h2s_da_model_f3)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
# Model 3: Excluding October 2021
train <- daily_full %>%
filter(year != '2021', month != '10') %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)) %>%
sample_frac(0.8)
test <- anti_join(daily_full %>%
filter(year != '2021', month != '10') %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints)`
h2s_da_model_f4 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints,
data = train)
predictions <- predict(h2s_da_model_f4, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f4)$n - 1)/
(summary(h2s_da_model_f4)$n - summary(h2s_da_model_f4)$np - 1)
result <- rbind(result, tibble(Model = 'Exclude October 2021',
'80/20 Train R-Sq' = summary(h2s_da_model_f4)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
# Model 4: October 2021 Indicator
train <- daily_full %>%
select(all_of(predictors)) %>%
mutate(october2021 = if_else(year == '2021', month == '10', 1, 0)) %>%
filter(complete.cases(.)) %>%
sample_frac(0.8)
test <- anti_join(daily_full %>%
select(all_of(predictors)) %>%
mutate(october2021 = if_else(year == '2021', month == '10', 1, 0)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, october2021)`
h2s_da_model_f5 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
october2021,
data = train)
predictions <- predict(h2s_da_model_f5, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f5)$n - 1)/
(summary(h2s_da_model_f5)$n - summary(h2s_da_model_f5)$np - 1)
result <- rbind(result, tibble(Model = 'October 2021 Indicator',
'80/20 Train R-Sq' = summary(h2s_da_model_f5)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
knitr::kable(tibble(Model = c('Since Feb 2022',
'October 2021 Only',
'Exclude October 2021',
'October 2021 Indicator'),
'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
round(summary(h2s_da_model_g)$r.sq, 2),
round(summary(h2s_da_model_h)$r.sq, 2),
round(summary(h2s_da_model_i)$r.sq, 2))) %>%
left_join(result, join_by(Model)))
| Since Feb 2022 |
0.60 |
0.5798024 |
0.6412991 |
| October 2021 Only |
0.43 |
0.3960353 |
-0.0016982 |
| Exclude October 2021 |
0.58 |
0.5850159 |
0.5312950 |
| October 2021 Indicator |
0.25 |
0.2652137 |
0.5154652 |
Cross Validation on each monitor
result_cv <- tibble(Model = character(),
'CV Avg Train R-Sq' = numeric(),
'CV Avg Test R-Sq' = numeric())
# model 1: since Feb 2022
post2022feb <- daily_full %>%
filter(day >= '2022-02-01') %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.))
monitors <- unique(post2022feb$Monitor)
cv1_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- post2022feb %>%
filter(Monitor != monitors[i])
test <- post2022feb %>%
filter(Monitor == monitors[i])
h2s_da_model_f2b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints,
data = train)
predictions <- predict(h2s_da_model_f2b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
(summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
cv1_result <- rbind(cv1_result, tibble(monitors[i],
summary(h2s_da_model_f2b)$r.sq,
summary(h2s_da_model_f2b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'Since Feb 2022',
'CV Avg Train R-Sq' = mean(cv1_result$`summary(h2s_da_model_f2b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv1_result$adj_r2_test))))
# model 2: October 2021 only
october2021 <- daily_full %>%
filter(year == '2021', month == '10') %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.))
monitors <- unique(october2021$Monitor)
cv2_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- october2021 %>%
filter(Monitor != monitors[i])
test <- october2021 %>%
filter(Monitor == monitors[i])
h2s_da_model_f2b <- gam(H2S_daily_avg~as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints,
data = train)
predictions <- predict(h2s_da_model_f2b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
(summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
cv2_result <- rbind(cv2_result, tibble(monitors[i],
summary(h2s_da_model_f2b)$r.sq,
summary(h2s_da_model_f2b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'October 2021 Only',
'CV Avg Train R-Sq' = mean(cv2_result$`summary(h2s_da_model_f2b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv2_result$adj_r2_test))))
# model 3: Exclude October 2021
exclude_oct2021 <- daily_full %>%
filter(year != '2021', month != '10') %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.))
monitors <- unique(exclude_oct2021$Monitor)
cv3_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- exclude_oct2021 %>%
filter(Monitor != monitors[i])
test <- exclude_oct2021 %>%
filter(Monitor == monitors[i])
h2s_da_model_f2b <- gam(H2S_daily_avg~as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints,
data = train)
predictions <- predict(h2s_da_model_f2b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
(summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
cv3_result <- rbind(cv3_result, tibble(monitors[i],
summary(h2s_da_model_f2b)$r.sq,
summary(h2s_da_model_f2b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'Exclude October 2021',
'CV Avg Train R-Sq' = mean(cv3_result$`summary(h2s_da_model_f2b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv3_result$adj_r2_test))))
# model 4: October 2021 Indicator
full_complete <- daily_full %>%
filter(year != '2021', month != '10') %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.)) %>%
mutate(october2021 = if_else(year == '2021', month == '10', 1, 0))
monitors <- unique(full_complete$Monitor)
cv4_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- full_complete %>%
filter(Monitor != monitors[i])
test <- full_complete %>%
filter(Monitor == monitors[i])
h2s_da_model_f2b <- gam(H2S_daily_avg~as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
october2021,
data = train)
predictions <- predict(h2s_da_model_f2b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
(summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
cv4_result <- rbind(cv4_result, tibble(monitors[i],
summary(h2s_da_model_f2b)$r.sq,
summary(h2s_da_model_f2b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'October 2021 Indicator',
'CV Avg Train R-Sq' = mean(cv4_result$`summary(h2s_da_model_f2b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv4_result$adj_r2_test))))
cv1_result
cv2_result
cv3_result
cv4_result
knitr::kable(tibble(Model = c('Since Feb 2022',
'October 2021 Only',
'Exclude October 2021',
'October 2021 Indicator'),
'Train R-sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
round(summary(h2s_da_model_g)$r.sq, 2),
round(summary(h2s_da_model_h)$r.sq, 2),
round(summary(h2s_da_model_i)$r.sq, 2))) %>%
left_join(result, join_by(Model)) %>%
left_join(result_cv, join_by(Model)))
| Since Feb 2022 |
0.60 |
0.5798024 |
0.6412991 |
0.6057831 |
0.1821341 |
| October 2021 Only |
0.43 |
0.3960353 |
-0.0016982 |
0.7539210 |
-0.4691843 |
| Exclude October 2021 |
0.58 |
0.5850159 |
0.5312950 |
0.5779813 |
0.0773571 |
| October 2021 Indicator |
0.25 |
0.2652137 |
0.5154652 |
0.5779813 |
0.0770660 |
Monthly
h2s_ma_model_a <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_ma_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year +
## wd_sec + ws + I(1/MinDist^2) + Refinery
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.299e+00 6.337e-01 3.628 0.000285 ***
## year2021 7.513e+00 3.587e-01 20.948 < 2e-16 ***
## year2022 -9.719e+00 3.990e-01 -24.359 < 2e-16 ***
## year2023 -4.519e+00 5.031e-01 -8.983 < 2e-16 ***
## wd_secQ2 -1.993e-01 3.750e-01 -0.531 0.595141
## wd_secQ3 3.380e+00 3.813e-01 8.865 < 2e-16 ***
## wd_secQ4 -1.999e+00 3.604e-01 -5.546 2.93e-08 ***
## ws 1.983e-01 4.050e-02 4.898 9.68e-07 ***
## I(1/MinDist^2) 5.375e+05 3.251e+05 1.653 0.098241 .
## RefineryMarathon (Carson) 4.189e+01 5.190e-01 80.710 < 2e-16 ***
## RefineryMarathon (Wilmington) 4.183e+00 6.036e-01 6.930 4.21e-12 ***
## RefineryPhillips 66 (Wilmington) 3.237e+00 5.332e-01 6.070 1.28e-09 ***
## RefineryTorrance Refinery -3.331e+00 4.910e-01 -6.785 1.16e-11 ***
## RefineryValero Refinery 6.077e+00 5.903e-01 10.294 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.999 8 4076 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0271 Deviance explained = 2.71%
## GCV = 24024 Scale est. = 24024 n = 2056378
plot(h2s_ma_model_a)

h2s_ma_model_b <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude.x, longitude.x, bs='tp', k=10), data = full_data)
summary(h2s_ma_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year +
## wd_sec + ws + I(1/MinDist^2) + Refinery + s(latitude.x, longitude.x,
## bs = "tp", k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 247.73043 1330.12474 0.186 0.852252
## year2021 4.46631 0.34810 12.830 < 2e-16 ***
## year2022 -5.16925 0.39625 -13.045 < 2e-16 ***
## year2023 7.01499 0.49759 14.098 < 2e-16 ***
## wd_secQ2 0.75576 0.34603 2.184 0.028956 *
## wd_secQ3 0.04189 0.35248 0.119 0.905406
## wd_secQ4 -1.22491 0.33302 -3.678 0.000235 ***
## ws -0.01374 0.03742 -0.367 0.713562
## I(1/MinDist^2) 1.80216 0.32232 5.591 2.26e-08 ***
## RefineryMarathon (Carson) -768.08881 1478.49393 -0.520 0.603407
## RefineryMarathon (Wilmington) 93.90759 1452.37996 0.065 0.948447
## RefineryPhillips 66 (Wilmington) 0.48764 1513.24530 0.000 0.999743
## RefineryTorrance Refinery -241.61130 1400.50381 -0.173 0.863031
## RefineryValero Refinery -388.51087 1432.72236 -0.271 0.786261
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.999 8 4850 <2e-16 ***
## s(latitude.x,longitude.x) 7.000 7 51324 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 30/31
## R-sq.(adj) = 0.172 Deviance explained = 17.2%
## GCV = 20447 Scale est. = 20446 n = 2056378
plot(h2s_ma_model_b)


Try XGBoost on Daily Average
daily_full <- daily_full %>%
mutate(Refinery = str_replace_all(str_replace_all(Refinery, '[()]', ''), ' ', '_'),
Monitor = str_replace_all(Monitor, ' ', '_'),
weekday = as.character(weekday),
daily_downwind_ref = as.integer(daily_downwind_ref),
daily_downwind_wrp = as.integer(daily_downwind_wrp))
train <- daily_full[complete.cases(daily_full),] %>%
filter(day >= '2022-02-01')
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, active_2p5km, active_5km, monthly_oil_2p5km,
monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km, H2S_daily_max,
mon_utm_x, mon_utm_y, county)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
max_depth = c(3, 4, 5),
eta = c(0.1, 0.3),
gamma = c(0.01, 0.001),
colsample_bytree = c(0.5, 1),
min_child_weight = 0,
subsample = c(0.5, 0.75, 1))
# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv",
number=10,
verboseIter=TRUE,
search='grid')
fit.xgb_da <- readRDS('rfiles/fit.xgb_da.rds')
# fit.xgb_da <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da, 'rfiles/fit.xgb_da.rds')
getTrainPerf(fit.xgb_da)
fit.xgb_da$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 478.6 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 38
## niter: 200
## nfeatures : 38
## xNames : monitor_lat monitor_lon MinDist Converted_Angle monthly_oil_1km monthly_gas_1km dist_wrp capacity active_1km wrp_angle ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 104 200 5 0.1 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da,scale=FALSE)
plot(imp, top=10)

SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 43
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da$finalModel, top_n = 10)

CV by leaving each monitor out once
post2022feb <- daily_full %>%
filter(day >= '2022-02-01')
monitor_names <- unique(post2022feb$Monitor)
#
# for (i in 1:length(monitor_names)) {
# train <- post2022feb %>%
# filter(Monitor != monitor_names[i])
#
# train <- train[complete.cases(train),]
#
# train <- fastDummies::dummy_cols(train %>%
# select(-c(Refinery, Monitor, day, active_2p5km, active_5km, monthly_oil_2p5km,
# monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km, H2S_daily_max,
# mon_utm_x, mon_utm_y, county)) %>%
# mutate(MinDist = 1/(MinDist^2),
# dist_wrp = 1/(dist_wrp^2)),
# remove_selected_columns = TRUE)
#
# fit.xgb_da <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
#
# saveRDS(fit.xgb_da, paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
# }
model_stats <- tibble(Monitor = character(), train_r2 = numeric(), test_r2 = numeric())
add_cols <- function(df, cols) {
add <- cols[!cols %in% names(df)]
if(length(add) !=0 ) df[add] <- NA
return(df)
}
for (i in setdiff(1:length(monitor_names), c(9))) {
fit.xgb_da <- readRDS(paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
test <- post2022feb %>%
filter(Monitor == monitor_names[i])
test <- fastDummies::dummy_cols(test[complete.cases(test),] %>%
ungroup() %>%
select(-c(Refinery, Monitor, day, active_2p5km, active_5km, monthly_oil_2p5km,
monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km, H2S_daily_max,
mon_utm_x, mon_utm_y, county)) %>%
add_cols(c('year_2022', 'year_2023', 'month_01', 'month_02',
'month_03', 'month_04', 'month_05', 'month_06',
'month_07', 'month_08', 'month_09', 'month_10',
'month_11', 'month_12')) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2),
year_2022 = ifelse(is.na(year_2022), 0, year_2022),
year_2023 = ifelse(is.na(year_2023), 0, year_2023),
month_01 = ifelse(is.na(month_01), 0, month_01),
month_02 = ifelse(is.na(month_02), 0, month_02),
month_03 = ifelse(is.na(month_03), 0, month_03),
month_04 = ifelse(is.na(month_04), 0, month_04),
month_05 = ifelse(is.na(month_05), 0, month_05),
month_06 = ifelse(is.na(month_06), 0, month_06),
month_07 = ifelse(is.na(month_07), 0, month_07),
month_08 = ifelse(is.na(month_08), 0, month_08),
month_09 = ifelse(is.na(month_09), 0, month_09),
month_10 = ifelse(is.na(month_10), 0, month_10),
month_11 = ifelse(is.na(month_11), 0, month_11),
month_12 = ifelse(is.na(month_12), 0, month_12)),
remove_selected_columns = TRUE)
train_r2 <- getTrainPerf(fit.xgb_da)$TrainRsquared
predictions <- predict(fit.xgb_da$finalModel,
newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_r2 <- R2(test %>% pull(H2S_daily_avg), predictions)
model_stats <- rbind(model_stats, tibble(Monitor = monitor_names[i], train_r2, test_r2))
}
model_stats %>% rbind(tibble(Monitor = 'Total Average', train_r2 = mean(model_stats$train_r2, na.rm = T),
test_r2 = mean(model_stats$test_r2, na.rm = T)))